# MIR test/train cross validation

library(tidyverse)
library(data.table)
library(dtplyr)
library(tidytext)

source("~/Dropbox/CHAMP-Net/coronavirus_paper/data_and_code/replication_file/scripts/congress_tweets_functions.R")

# Read the tweet-level CSV and hold it as a data.table so the text column can
# be rewritten by reference.
covid.tweets.nrt <- data.table(
  readr::read_csv("~/Dropbox/CHAMP-Net/coronavirus_paper/data_and_code/covid_tweets_nrt_4120.csv")
)

# Collapse every hashtag that starts with "#china" (followed by at least one
# word character and preceded by a whitespace character) into the single
# token " china_hashtag". A bare "#china" or a hashtag at the very start of
# the text is not matched by this pattern.
covid.tweets.nrt[
  ,
  lowtext := stringr::str_replace_all(
    lowtext,
    "\\s(#china[\\w]+)",
    " china_hashtag"
  )
]

# Hand the data back to the tidyverse side of the script as a tibble.
covid.tweets.nrt <- as_tibble(covid.tweets.nrt)

# Divert R's message stream (messages, warnings, errors) to a log file while
# the cross-validation runs.
zz <- file("cv_outfile.txt", open = "wt")
sink(zz, append = TRUE, type = "message")

# Run the cross-validation helper on tweets dated before 2020-04-01.
# `MIR_to_pred_multifolds()` is not defined in this file (presumably it comes
# from the sourced congress_tweets_functions.R -- confirm there for the exact
# meaning of each argument). The result is used below via its `$preds` and
# `$fit_results` elements.
full_MIR_cv <- MIR_to_pred_multifolds(
  dat = covid.tweets.nrt %>% filter(date < "2020-04-01"),
  outvar = "is_republican",
  text_col = "lowtext",
  doc_col = "index",
  to_compound = c(
    "green new deal", "nuclear power", "cap and trade", "clean coal", "gun violence", "national security",
    "tax cut", "cut tax", "cut taxes", "president trump", "american people", "trump administration",
    "medicare for all", "health care", "health insurance",
    "single payer", "assault weapon", "assault weapons", "semi automatic", "second amendment", "brady bill",
    # NOTE(review): "magainze" looks like a typo for "magazine"; left as-is so
    # results stay reproducible -- confirm whether it should be corrected.
    "high capacity magainze", "high capacity magazines", "bump stock", "bump stocks", "background check",
    "background checks", "build the wall", "wall funding", "birthright citizenship", "nuclear deal",
    "pro life", "pro choice", "anti choice", "born alive", "partial birth", "late term",
    "house democrats", "house republicans", "senate democrats", "senate republicans",
    "majority leader", "minority leader", "town hall", "donald trump", "social security",
    "law enforcement", "preexisting conditions",

    # covid compounds
    "world health organization", "centers for disease control",
    "supply chain", "shelter in place", "defense production act",
    # NOTE(review): "town hall" already appears earlier in this vector.
    "personal protective equipment", "laid off", "town hall",
    "wish list", "op ed"
  ),
  # NOTE(review): the denominator is the row count of the full data set, not
  # of the date-filtered subset passed as `dat` -- confirm this is intended.
  term_min = 100 / nrow(covid.tweets.nrt),
  date_var = "day_relative_1120",
  ngrams = 1:3
)

# Restore the message stream and release the log file. A bare `sink()` only
# removes an *output* diversion, so the message diversion has to be undone
# explicitly with `type = "message"`, and the connection closed afterwards.
# These stay as separate top-level statements so that warnings deferred to the
# end of the call above are still written to the log. If that call errors,
# the diversion remains active and must be reset with sink(type = "message").
sink(type = "message")
close(zz)

# Write the `preds` element of the cross-validation result to the
# replication output folder.
data.table::fwrite(
  x = full_MIR_cv$preds,
  file = "~/Dropbox/CHAMP-Net/coronavirus_paper/data_and_code/replication_file/output/fullcv_MIR_preds.csv"
)

# Write the `fit_results` element alongside it.
data.table::fwrite(
  x = full_MIR_cv$fit_results,
  file = "~/Dropbox/CHAMP-Net/coronavirus_paper/data_and_code/replication_file/output/fullcv_MIR_cvfit.csv"
)
